import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"
import altair as alt
alt.data_transformers.disable_max_rows()
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
Data Visualization Design ProjectΒΆ
DTSA 5304 - Spring 2025
Introduction - Goals, tasks, dataΒΆ
The basic motivation behind this visualization is to provide marathon runners with a simple tool to develop an overall understanding of the distribution of finish times at the Boston marathon. If successful, this visualization will allow users to quickly answer some basic questions about where they might rank based on their estimated finish time, or what pace or finish time would be required to achieve a certain place goal. Since the target audience is runners who are experienced or knowledgable enough to be interested in data about the Boston Marathon it is assumed that they have at least a basic understanding of how running data is displayed (for example - pace instead of speed), but they shouldn't need to understand any advanced statistics concepts.
The sample data for this visualization is the complete 2023 Boston Marathon results found at https://data.scorenetwork.org/running/boston_marathon_2023.html.
A few simple protoypes were developed and three experienced runners were recruited to provide feedback as part of a formative evaluation to develop the final visualization. While the subjects weren't specifically interested in running the Boston Marathon, they had the required domain knowledge to understand the context of the data and the tasks. The subject were prompted with:
Pretend that you are trying to qualify for the Boston Marathon or have recently qualified. You have a general idea of what your finish time might be. Using this visualization can you answer some basic questions for yourself or get a general idea about where you will finish overall or among your peers? What other questions does interacting with this visualization create?
First design prototypesΒΆ
A histogram of finish time bins was chosen to provide overall context for the distribution of finish times with an interactive hover tooltip to provide more specific information about each bin. The possiblity of filtering by gender or age category, either through an interactive filter like a dropdown or legend selection or small multiples, was also identified in this prototype.
Data Import and basic cleaningΒΆ
url = "https://data.scorenetwork.org/data/boston_marathon_2023.csv"
df = pd.read_csv(url)
df.head()
| age_group | place_overall | place_gender | place_division | name | team | bib_number | half_time | finish_net | finish_gun | gender | half_time_sec | finish_net_sec | finish_gun_sec | finish_net_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18-39 | 1 | 1 | 1 | Chebet, Evans | Teamβ | 1 | 1H 2M 20S | 2H 5M 54S | 2H 5M 54S | M | 3740.0 | 7554 | 7554 | 125.900000 |
| 1 | 18-39 | 2 | 2 | 2 | Geay, Gabriel | Teamβ | 3 | 1H 2M 20S | 2H 6M 4S | 2H 6M 4S | M | 3740.0 | 7564 | 7564 | 126.066667 |
| 2 | 18-39 | 3 | 3 | 3 | Kipruto, Benson | Teamβ | 5 | 1H 2M 19S | 2H 6M 6S | 2H 6M 6S | M | 3739.0 | 7566 | 7566 | 126.100000 |
| 3 | 18-39 | 4 | 4 | 4 | Korir, Albert | Teamβ | 19 | 1H 2M 20S | 2H 8M 1S | 2H 8M 1S | M | 3740.0 | 7681 | 7681 | 128.016667 |
| 4 | 18-39 | 5 | 5 | 5 | Talbi, Zouhair | Teamβ | 31 | 1H 2M 20S | 2H 8M 35S | 2H 8M 35S | M | 3740.0 | 7715 | 7715 | 128.583333 |
#convert the finish_net_sec column to date time so we can treat it as a duration
df["finish_time"] = pd.to_datetime(df["finish_net_sec"], unit="s", origin="1970-01-01")
df.head()
| age_group | place_overall | place_gender | place_division | name | team | bib_number | half_time | finish_net | finish_gun | gender | half_time_sec | finish_net_sec | finish_gun_sec | finish_net_minutes | finish_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18-39 | 1 | 1 | 1 | Chebet, Evans | Teamβ | 1 | 1H 2M 20S | 2H 5M 54S | 2H 5M 54S | M | 3740.0 | 7554 | 7554 | 125.900000 | 1970-01-01 02:05:54 |
| 1 | 18-39 | 2 | 2 | 2 | Geay, Gabriel | Teamβ | 3 | 1H 2M 20S | 2H 6M 4S | 2H 6M 4S | M | 3740.0 | 7564 | 7564 | 126.066667 | 1970-01-01 02:06:04 |
| 2 | 18-39 | 3 | 3 | 3 | Kipruto, Benson | Teamβ | 5 | 1H 2M 19S | 2H 6M 6S | 2H 6M 6S | M | 3739.0 | 7566 | 7566 | 126.100000 | 1970-01-01 02:06:06 |
| 3 | 18-39 | 4 | 4 | 4 | Korir, Albert | Teamβ | 19 | 1H 2M 20S | 2H 8M 1S | 2H 8M 1S | M | 3740.0 | 7681 | 7681 | 128.016667 | 1970-01-01 02:08:01 |
| 4 | 18-39 | 5 | 5 | 5 | Talbi, Zouhair | Teamβ | 31 | 1H 2M 20S | 2H 8M 35S | 2H 8M 35S | M | 3740.0 | 7715 | 7715 | 128.583333 | 1970-01-01 02:08:35 |
df['finish_time'] = df['finish_time'].astype('datetime64[s]')
# Group timestamps into 5-minute bins
df["binned_time"] = df["finish_time"].dt.ceil("5min")
Interaction and basic layout prototypesΒΆ
Once the data was cleaned and formatted a simple interactive distribution chart was developed with plotly as a proof of concept. This chart was used to determine if a histogram chart was appropriate and practical and what bin sizes would be most useful.
simple_fig = px.histogram(
x = df['finish_time'],
nbins = 100
)
# Format x-axis to show HH:MM:SS
simple_fig.update_layout(
xaxis_title="Finish Time (HH:MM:SS)",
xaxis=dict(tickformat="%H:%M:%S")
)
simple_fig
Grouping prototypesΒΆ
Since I knew that the visualization would have group level filtering in the tooltip and possibly other interactions I started by creating a new dataframe with grouped data. At this point I also switched to Altair since it provides the type of filtering and interaction I wanted.
Grouped df codeΒΆ
# Create all group counts at once
gender_groups = df.pivot_table(
index="binned_time",
columns="gender",
aggfunc="size",
fill_value=0
).rename(columns={'M': 'Male', 'W': 'Female'})
age_groups = df.pivot_table(
index="binned_time",
columns="age_group",
aggfunc="size",
fill_value=0
)
# Combine all groups
grouped_df = pd.concat([
df.groupby("binned_time").size().rename("All"),
gender_groups,
age_groups
], axis=1).reset_index()
grouped_df['binned_time_low'] = grouped_df['binned_time'] - pd.Timedelta(minutes=5)
# For just the 'All' column
grouped_df['All_cumsum'] = grouped_df['All'].cumsum()
# Add cumulative sum columns for all count columns
grouped_df['All_cumsum'] = grouped_df['All'].cumsum()
grouped_df['Male_cumsum'] = grouped_df['Male'].cumsum()
grouped_df['Female_cumsum'] = grouped_df['Female'].cumsum()
total_finishers = grouped_df['All'].sum()
male_finishers = grouped_df['Male'].sum()
female_finishers = grouped_df['Female'].sum()
# add percentile columns for All/male/femail
grouped_df['Total_percentile'] = (grouped_df['All_cumsum'] / total_finishers)
grouped_df['male_percentile'] = (grouped_df['Male_cumsum'] / male_finishers)
grouped_df['female_percentile'] = (grouped_df['Female_cumsum'] / female_finishers)
grouped_df.head()
| binned_time | All | Male | Female | 18-39 | 40-44 | 45-49 | 50-54 | 55-59 | 60-64 | ... | 70-74 | 75-79 | 80+ | binned_time_low | All_cumsum | Male_cumsum | Female_cumsum | Total_percentile | male_percentile | female_percentile | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1970-01-01 02:10:00 | 8 | 8 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1970-01-01 02:05:00 | 8 | 8 | 0 | 0.000301 | 0.000527 | 0.000000 |
| 1 | 1970-01-01 02:15:00 | 14 | 14 | 0 | 14 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1970-01-01 02:10:00 | 22 | 22 | 0 | 0.000827 | 0.001450 | 0.000000 |
| 2 | 1970-01-01 02:20:00 | 6 | 6 | 0 | 6 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1970-01-01 02:15:00 | 28 | 28 | 0 | 0.001053 | 0.001845 | 0.000000 |
| 3 | 1970-01-01 02:25:00 | 47 | 32 | 15 | 44 | 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1970-01-01 02:20:00 | 75 | 60 | 15 | 0.002820 | 0.003954 | 0.001313 |
| 4 | 1970-01-01 02:30:00 | 71 | 66 | 5 | 59 | 11 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1970-01-01 02:25:00 | 146 | 126 | 20 | 0.005489 | 0.008303 | 0.001751 |
5 rows Γ 21 columns
First Altair FigureΒΆ
# Create Altair bar chart (looks like a histogram)
alt_fig = alt.Chart(grouped_df).mark_bar().encode(
x=alt.X("binned_time:T", title="Finish Time (HH:MM:SS)", axis=alt.Axis(format="%H:%M:%S")), # X-axis is timestamp
y=alt.Y("All:Q", title="Count"),
tooltip=[
alt.Tooltip("binned_time_low:T", title="Finish between", format="%H:%M:%S"),
alt.Tooltip("binned_time:T", title="and", format="%H:%M:%S"),
alt.Tooltip("All:Q", title="Number of finishers"),
alt.Tooltip("Total_percentile:Q", title="% Rank", format=".0%"),
]
).properties(
title="Finish Time Distribution (5-Minute Groups)",
width=800
).interactive()
alt_fig
I did test this visual with my users - but with no specific context or task prompt. I wanted to understand if they generally understood what the chart was showing and if the data made sense to them.
While the grouped dataframe approach did make it easy to created binned histogram with some basic tooltips, it also became clear while developing this prototype that doing so would limit a lot of dynamic interaction that I originally wanted. For example, using the grouped dataframe as the source data would make it exceptionally difficult to have the hover tooltip update to reflect only gender-specific data when the user hovered over a gender specific bar. So for those reasons I abandoned the grouped dataframe approach and instead decided to manipulated the original dataframe that had individual finisher results.
Feedback prototypeΒΆ
At this point I'd learned enough to develop the first version that I'd test with my users with the task and context prompt.
Additional columns and formattingΒΆ
As an aside - Altair and most python and R visualization libraries have very limited or no ability to handle duration (timedelta) datatypes. That's whay there is a lot of extra code to make datatimes look like durations or to turn other datatypes into strings for proper display in the tooltips and on the axes.
df = df.sort_values(by=["finish_net_sec"]) # Sort by finish time, ignoring gender
df = df.reset_index(drop=True)
# Round up finish_time to the nearest 5-minute (300s) interval
df["rounded_finish_time_high"] = np.ceil(df["finish_net_sec"] / 300) * 300
df["rounded_finish_time_low"] = df["rounded_finish_time_high"] - 300
# Convert rounded seconds to HH:MM:SS format
df["finish_duration_bin_high"] = pd.to_datetime(df["rounded_finish_time_high"], unit="s").dt.strftime("%H:%M:%S")
df["finish_duration_bin_low"] = pd.to_datetime(df["rounded_finish_time_low"], unit="s").dt.strftime("%H:%M:%S")
df["percentile_overall"] = df["place_overall"].rank(pct=True)
# create overall cumsum column
df["cumulative_count"] = df.index + 1
# create pace column in minutes
df["pace_group"] = df["rounded_finish_time_high"] / 60 / 26.22
# create pace bucket column
df["pace_bin_string"] = (df["rounded_finish_time_high"] / 26.22 ).apply(lambda x: "{:02}:{:02}".format(*divmod(int(x), 60)))
# try to create good total count per finish bin
df["total_finishers_bin"] = df.groupby("rounded_finish_time_high")["rounded_finish_time_high"].transform("count")
# make finish range column
df["finish_range"] = df["finish_duration_bin_low"] + " - " + df["finish_duration_bin_high"]
# create a bin cumulative count cutoff that is gender neutral
df["cumulative_finishers_binned"] = df.groupby("finish_range")["cumulative_count"].transform("max")
df["gender_cumulative_count"] = df.groupby("gender")["finish_net_sec"].rank(method="first").astype(int)
df.head()
| age_group | place_overall | place_gender | place_division | name | team | bib_number | half_time | finish_net | finish_gun | ... | finish_duration_bin_high | finish_duration_bin_low | percentile_overall | cumulative_count | pace_group | pace_bin_string | total_finishers_bin | finish_range | cumulative_finishers_binned | gender_cumulative_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18-39 | 1 | 1 | 1 | Chebet, Evans | Teamβ | 1 | 1H 2M 20S | 2H 5M 54S | 2H 5M 54S | ... | 02:10:00 | 02:05:00 | 0.000038 | 1 | 4.958047 | 04:57 | 8 | 02:05:00 - 02:10:00 | 8 | 1 |
| 1 | 18-39 | 2 | 2 | 2 | Geay, Gabriel | Teamβ | 3 | 1H 2M 20S | 2H 6M 4S | 2H 6M 4S | ... | 02:10:00 | 02:05:00 | 0.000075 | 2 | 4.958047 | 04:57 | 8 | 02:05:00 - 02:10:00 | 8 | 2 |
| 2 | 18-39 | 3 | 3 | 3 | Kipruto, Benson | Teamβ | 5 | 1H 2M 19S | 2H 6M 6S | 2H 6M 6S | ... | 02:10:00 | 02:05:00 | 0.000113 | 3 | 4.958047 | 04:57 | 8 | 02:05:00 - 02:10:00 | 8 | 3 |
| 3 | 18-39 | 4 | 4 | 4 | Korir, Albert | Teamβ | 19 | 1H 2M 20S | 2H 8M 1S | 2H 8M 1S | ... | 02:10:00 | 02:05:00 | 0.000150 | 4 | 4.958047 | 04:57 | 8 | 02:05:00 - 02:10:00 | 8 | 4 |
| 4 | 18-39 | 5 | 5 | 5 | Talbi, Zouhair | Teamβ | 31 | 1H 2M 20S | 2H 8M 35S | 2H 8M 35S | ... | 02:10:00 | 02:05:00 | 0.000188 | 5 | 4.958047 | 04:57 | 8 | 02:05:00 - 02:10:00 | 8 | 5 |
5 rows Γ 29 columns
User tested prototypeΒΆ
This is the version I tested with my users with the context and task prompts.
# Altair Chart with Correct Binning
chart = alt.Chart(df).mark_bar().encode(
alt.X("finish_net_sec:Q")
.bin(step=300, extent=[2*3600, 7*3600]) # 5-minute bins, from 2:00 AM to 7:00 AM
.scale(domain=[2*3600,7*3600]) # Keep the x-axis in the correct range
.axis(
title="Finish Time (HH:MM:SS)",
labelExpr="timeFormat((datum.value - (17 * 3600)) * 1000, '%H:%M:%S')" # Format as HH:MM:SS
),
y="count()",
color="gender",
tooltip=[
alt.Tooltip("finish_duration_bin_low:N",
title="Finish between "),
alt.Tooltip("finish_duration_bin_high:N",
title="and "),
alt.Tooltip("max(percentile_overall):Q",
title="Overall percentile",
format=".1%")
]
).properties(
width=800
).interactive()
chart